Antonio Bartolomé Redondo
Daniel Buendía Ureña
We are an insurer operating in the European market that plans to internationalize by acquiring a Canadian insurer as a subsidiary. In order to evaluate this operation, the board of directors has asked the Data Science department to study and then model the collisions that occurred in the country between 1999 and 2014, with the goal of determining their degree of mortality.
To that end, the Data Science team will use the data available at the following link: database. The database contains traffic collision data for Canada from 1999 to 2014, provided by Transport Canada. This dataset offers various features, such as the time of day, whether or not there were fatalities, the driver's sex, etc. The codes for the different categories are detailed below in the data dictionary.
# Load the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')
# Function declarations
def plot_feature(df, col_name, isContinuous, target):
    """
    Plot each variable individually and against the target variable.
    - df: dataframe
    - col_name: name of the variable to plot
    - isContinuous: True if the variable is numeric, False otherwise
    - target: target variable
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 3), dpi=90)
    count_null = df[col_name].isnull().sum()
    if isContinuous:
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(x=df[col_name], order=sorted(df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name + '. Number of nulls: ' + str(count_null))
    plt.xticks(rotation=90)
    if isContinuous:
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by ' + target)
    else:
        data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
        data.columns = [col_name, target, 'proportion']
        sns.barplot(x=col_name, y='proportion', hue=target, data=data, saturation=1, ax=ax2)
        ax2.set_ylabel(target + ' fraction')
        ax2.set_title(target)
        plt.xticks(rotation=90)
    ax2.set_xlabel(col_name)
    plt.tight_layout()
def get_deviation_of_mean_perc(data, list_var_continuous, target, multiplier):
    """
    Return a dataframe with a count of the outliers of each variable, the percentage
    they represent over the variable's total values, and their distribution with
    respect to the target.
    - data: dataframe
    - list_var_continuous: list of numeric variables
    - target: target variable
    - multiplier: multiplier to apply to the standard deviation
    """
    pd_final = pd.DataFrame()
    for i in list_var_continuous:
        series_mean = data[i].mean()
        series_std = data[i].std()
        std_amp = multiplier * series_std
        left = series_mean - std_amp
        right = series_mean + std_amp
        size_s = data[i].size
        perc_goods = data[i][(data[i] >= left) & (data[i] <= right)].size / size_s
        perc_excess = data[i][(data[i] < left) | (data[i] > right)].size / size_s
        if perc_excess > 0:
            pd_concat_percent = pd.DataFrame(data[target][(data[i] < left) | (data[i] > right)]\
                .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0],
                                         pd_concat_percent.iloc[0, 1]]
            pd_concat_percent = pd_concat_percent.drop('index', axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_outlier_values'] = data[i][(data[i] < left) | (data[i] > right)].size
            pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
    if pd_final.empty:
        print('No variables with outlier values exist')
    return pd_final
def get_percent_null_values_target(data, features, target):
    """
    Return a dataframe with a count of the null values of each variable, the percentage
    they represent over the variable's total values, and their distribution with
    respect to the target.
    - data: dataframe
    - features: list of variables to process
    - target: target variable
    """
    pd_final = pd.DataFrame()
    for i in features:
        if data[i].isnull().sum() > 0:
            pd_concat_percent = pd.DataFrame(data[target][data[i].isnull()]\
                .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0],
                                         pd_concat_percent.iloc[0, 1]]
            pd_concat_percent = pd_concat_percent.drop('index', axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['Number_of_NA'] = data[i].isnull().sum()
            pd_concat_percent['NA_percentage'] = data[i].isnull().sum() / data.shape[0]
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
    if pd_final.empty:
        print('No variables with null values exist')
    else:
        pd_final = pd_final.sort_values(['Number_of_NA', 'NA_percentage'], ascending=False)
    return pd_final
def categorical_na_imputer(feature, data, NA_df):
    """
    Impute the null values of a categorical variable by randomly generating non-null
    values of the variable according to their weight, i.e. their frequency of
    appearance in the variable.
    - feature: variable to impute
    - data: dataframe
    - NA_df: dataframe containing the number of NAs of the variable
    """
    df_values = pd.DataFrame(data[feature].value_counts().sort_values(ascending=False))
    df_values['porcentaje_values'] = df_values[feature] / df_values[feature].sum()
    list_values = df_values.index.to_numpy()
    list_weights = df_values["porcentaje_values"].to_numpy()
    na_number = NA_df['Number_of_NA'][NA_df['variable'] == feature].values[0]
    np.random.seed(seed=12345)
    new_values = np.random.choice(a=list_values, size=na_number, replace=True, p=list_weights).tolist()
    index_values = data.index[data[feature].isna()].tolist()
    na_fill_values = pd.Series(new_values, index_values)
    data[feature] = data[feature].fillna(na_fill_values)
# Read the data
data = pd.read_csv('data/NCDB_1999_to_2014.csv', low_memory=False)
data
| | C_YEAR | C_MNTH | C_WDAY | C_HOUR | C_SEV | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | ... | V_ID | V_TYPE | V_YEAR | P_ID | P_SEX | P_AGE | P_PSN | P_ISEV | P_SAFE | P_USER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1999 | 01 | 1 | 20 | 2 | 02 | 34 | UU | 1 | 5 | ... | 01 | 06 | 1990 | 01 | M | 41 | 11 | 1 | UU | 1 |
| 1 | 1999 | 01 | 1 | 20 | 2 | 02 | 34 | UU | 1 | 5 | ... | 02 | 01 | 1987 | 01 | M | 19 | 11 | 1 | UU | 1 |
| 2 | 1999 | 01 | 1 | 20 | 2 | 02 | 34 | UU | 1 | 5 | ... | 02 | 01 | 1987 | 02 | F | 20 | 13 | 2 | 02 | 2 |
| 3 | 1999 | 01 | 1 | 08 | 2 | 01 | 01 | UU | 5 | 3 | ... | 01 | 01 | 1986 | 01 | M | 46 | 11 | 1 | UU | 1 |
| 4 | 1999 | 01 | 1 | 08 | 2 | 01 | 01 | UU | 5 | 3 | ... | 99 | NN | NNNN | 01 | M | 05 | 99 | 2 | UU | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5860400 | 2014 | UU | U | UU | 2 | UU | UU | 01 | U | U | ... | 13 | 07 | UUUU | 01 | M | 24 | 11 | 1 | NN | 1 |
| 5860401 | 2014 | UU | U | 23 | 2 | 01 | 06 | 05 | 1 | 1 | ... | 01 | 14 | 2006 | 01 | M | 29 | 96 | 2 | 09 | 5 |
| 5860402 | 2014 | UU | U | 14 | 2 | 01 | 02 | 01 | 1 | 5 | ... | 01 | 01 | 2006 | 01 | F | UU | 11 | 2 | 01 | 1 |
| 5860403 | 2014 | UU | U | 22 | 1 | 01 | 06 | 01 | 2 | 4 | ... | 01 | 22 | UUUU | 01 | M | 67 | 12 | 3 | 01 | U |
| 5860404 | 2014 | UU | U | 22 | 1 | 01 | 06 | 01 | 2 | 4 | ... | 01 | 22 | UUUU | 02 | M | 10 | 98 | 1 | 01 | U |
5860405 rows × 22 columns
data.shape
(5860405, 22)
data.dtypes
C_YEAR     int64
C_MNTH    object
C_WDAY    object
C_HOUR    object
C_SEV      int64
C_VEHS    object
C_CONF    object
C_RCFG    object
C_WTHR    object
C_RSUR    object
C_RALN    object
C_TRAF    object
V_ID      object
V_TYPE    object
V_YEAR    object
P_ID      object
P_SEX     object
P_AGE     object
P_PSN     object
P_ISEV    object
P_SAFE    object
P_USER    object
dtype: object
Because of the coding in which the data are delivered, the first step is to define which type each variable corresponds to and to mark the undetermined values (UU, XX, ...) as missing.
target = ['C_SEV']
numeric_features = ['C_YEAR', 'C_VEHS', 'V_YEAR','P_AGE']
useless_columns = ['P_ISEV', 'V_ID', 'P_ID']
categories = [col for col in data.columns if col not in target+numeric_features+useless_columns]
numeric_features
['C_YEAR', 'C_VEHS', 'V_YEAR', 'P_AGE']
data[numeric_features] = data[numeric_features].apply(pd.to_numeric, errors='coerce',downcast='integer')
data
| | C_YEAR | C_MNTH | C_WDAY | C_HOUR | C_SEV | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | ... | V_ID | V_TYPE | V_YEAR | P_ID | P_SEX | P_AGE | P_PSN | P_ISEV | P_SAFE | P_USER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1999 | 01 | 1 | 20 | 2 | 2.0 | 34 | UU | 1 | 5 | ... | 01 | 06 | 1990.0 | 01 | M | 41.0 | 11 | 1 | UU | 1 |
| 1 | 1999 | 01 | 1 | 20 | 2 | 2.0 | 34 | UU | 1 | 5 | ... | 02 | 01 | 1987.0 | 01 | M | 19.0 | 11 | 1 | UU | 1 |
| 2 | 1999 | 01 | 1 | 20 | 2 | 2.0 | 34 | UU | 1 | 5 | ... | 02 | 01 | 1987.0 | 02 | F | 20.0 | 13 | 2 | 02 | 2 |
| 3 | 1999 | 01 | 1 | 08 | 2 | 1.0 | 01 | UU | 5 | 3 | ... | 01 | 01 | 1986.0 | 01 | M | 46.0 | 11 | 1 | UU | 1 |
| 4 | 1999 | 01 | 1 | 08 | 2 | 1.0 | 01 | UU | 5 | 3 | ... | 99 | NN | NaN | 01 | M | 5.0 | 99 | 2 | UU | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5860400 | 2014 | UU | U | UU | 2 | NaN | UU | 01 | U | U | ... | 13 | 07 | NaN | 01 | M | 24.0 | 11 | 1 | NN | 1 |
| 5860401 | 2014 | UU | U | 23 | 2 | 1.0 | 06 | 05 | 1 | 1 | ... | 01 | 14 | 2006.0 | 01 | M | 29.0 | 96 | 2 | 09 | 5 |
| 5860402 | 2014 | UU | U | 14 | 2 | 1.0 | 02 | 01 | 1 | 5 | ... | 01 | 01 | 2006.0 | 01 | F | NaN | 11 | 2 | 01 | 1 |
| 5860403 | 2014 | UU | U | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | NaN | 01 | M | 67.0 | 12 | 3 | 01 | U |
| 5860404 | 2014 | UU | U | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | NaN | 02 | M | 10.0 | 98 | 1 | 01 | U |
5860405 rows × 22 columns
nas = ["UU","XX","U","X","XXXX","NN","NNNN","N","UUUU"]
others = {
"P_SAFE" : "14",
"P_PSN": "100",
"V_TYPE": "24",
"C_TRAF": "19",
"C_RALN": "7",
"C_RSUR": "10",
"C_WTHR": "8",
"C_RCFG": "13",
"C_CONF": "42"
}
# Replace the undetermined codes with NaN across all categorical columns
data[categories] = data[categories].replace(to_replace=rf'^({"|".join(nas)})', value=np.nan, regex=True)
# Recode the special "Q..." codes ("other") into an explicit numeric category per variable
for key, value in others.items():
    data[key] = data[key].replace(to_replace=r'^Q.*', value=value, regex=True)
data[categories] = data[categories].astype('category')
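Note that with regex=True and a non-string replacement value, pandas replaces the whole cell with NaN whenever the pattern matches. A toy illustration, with hypothetical values:
# Toy illustration (hypothetical values) of the regex-based NA replacement
s = pd.Series(['01', 'UU', 'Q1', 'NN'])
print(s.replace(to_replace=rf'^({"|".join(nas)})', value=np.nan, regex=True))
# -> 01, NaN, Q1, NaN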
data.dtypes
C_YEAR       int16
C_MNTH    category
C_WDAY    category
C_HOUR    category
C_SEV        int64
C_VEHS     float64
C_CONF    category
C_RCFG    category
C_WTHR    category
C_RSUR    category
C_RALN    category
C_TRAF    category
V_ID        object
V_TYPE    category
V_YEAR     float64
P_ID        object
P_SEX     category
P_AGE      float64
P_PSN     category
P_ISEV      object
P_SAFE    category
P_USER    category
dtype: object
# Recode the target: 1 stays 1 (at least one fatality), 2 becomes 0 (no fatalities)
data[target] = data[target].replace({1: 1, 2: 0})
data[target] = data[target].astype('category')
pd_plot_target = data['C_SEV']\
.value_counts(normalize=True)\
.mul(100).rename('percent').reset_index()
pd_plot_target_conteo = data['C_SEV'].value_counts().reset_index()
pd_plot_target_pc = pd.merge(pd_plot_target, pd_plot_target_conteo, on=['index'], how='inner')
fig = px.histogram(pd_plot_target_pc, x="index", y=['percent'])
fig.show()
As the chart shows, the target variable is heavily imbalanced in the sample: collisions causing no fatalities are the large majority (98%), while the remaining 2% involved at least one death.
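The exact split behind the chart can be checked directly on the series; a quick sketch:
# Numeric check of the class balance shown above:
# 0 (no fatalities) ~ 98%, 1 (at least one fatality) ~ 2%
print(data['C_SEV'].value_counts(normalize=True).mul(100).round(2))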
for i in data.columns:
    if i in numeric_features:
        plot_feature(data, col_name=i, isContinuous=True, target='C_SEV')
    elif i in categories:
        plot_feature(data.dropna(subset=[i]), col_name=i, isContinuous=False, target='C_SEV')
Analysis of the most significant variables:
To detect the outliers in the numeric variables, after the graphical analysis above, they will be computed as the values exceeding the mean plus or minus three standard deviations, since for almost every variable too much information would be lost if the maximum and minimum limits of the box plots shown above were used instead.
get_deviation_of_mean_perc(data, numeric_features, target='C_SEV', multiplier=3)
| | 0.0 | 1.0 | variable | sum_outlier_values | porcentaje_sum_null_values |
|---|---|---|---|---|---|
| 0 | 0.942584 | 0.057416 | C_VEHS | 32395 | 0.005528 |
| 1 | 0.969529 | 0.030471 | V_YEAR | 21496 | 0.003668 |
| 2 | 0.950959 | 0.049041 | P_AGE | 3283 | 0.000560 |
After this analysis, the following will be considered outliers:
- C_VEHS: collisions involving more than 15 vehicles.
- V_YEAR: vehicle model years earlier than 1960.

These values will be treated according to the variable they belong to, so that they are neutralized when the dataset is later grouped by accident, since they are not able to discriminate the target variable. That is, C_VEHS outliers will be assigned the value 0, and V_YEAR outliers will be assigned the mean of the remaining values.
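For reference, the ±3σ bounds that informed these cutoffs can be reproduced per variable; the applied thresholds (15 vehicles, model year 1960) are rounder, hand-picked values chosen after inspecting the distributions rather than the exact bounds. A minimal sketch:
# Sketch: reproduce the mean ± 3·std bounds used by get_deviation_of_mean_perc
for col in ['C_VEHS', 'V_YEAR']:
    mu, sigma = data[col].mean(), data[col].std()
    print(f'{col}: [{mu - 3 * sigma:.1f}, {mu + 3 * sigma:.1f}]')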
# Variable C_VEHS
data["C_VEHS"] = np.where(data["C_VEHS"] > 15, 0, data["C_VEHS"])
For V_YEAR, the outliers will be treated as NA and handled later together with the rest of the missing values, under the same imputation strategy.
# Variable V_YEAR
data["V_YEAR"] = np.where(data["V_YEAR"] < 1960, np.nan, data["V_YEAR"])
First, we must analyze how many nulls each variable has and what share of its records they represent. It is also worth analyzing whether these values have any power to discriminate the target variable or whether, on the contrary, they keep the same percentage as the distribution in the sample.
NA_df = get_percent_null_values_target(data, numeric_features+categories, target='C_SEV')
NA_df
| | 0.0 | 1.0 | variable | Number_of_NA | NA_percentage |
|---|---|---|---|---|---|
| 0 | 0.982871 | 0.017129 | P_SAFE | 1187134 | 0.202569 |
| 1 | 0.979691 | 0.020309 | V_YEAR | 585796 | 0.099958 |
| 2 | 0.988144 | 0.011856 | C_RCFG | 504648 | 0.086111 |
| 3 | 0.992505 | 0.007495 | C_RALN | 434710 | 0.074177 |
| 4 | 0.989652 | 0.010348 | P_AGE | 395156 | 0.067428 |
| 5 | 0.973264 | 0.026736 | V_TYPE | 257667 | 0.043967 |
| 6 | 0.988408 | 0.011592 | P_SEX | 249740 | 0.042615 |
| 7 | 0.985610 | 0.014390 | C_TRAF | 223483 | 0.038134 |
| 8 | 0.985873 | 0.014127 | C_CONF | 179019 | 0.030547 |
| 16 | 0.975442 | 0.024558 | P_USER | 175586 | 0.029961 |
| 9 | 0.991100 | 0.008900 | C_WTHR | 87975 | 0.015012 |
| 10 | 0.989777 | 0.010223 | C_RSUR | 78451 | 0.013387 |
| 11 | 0.978932 | 0.021068 | P_PSN | 71815 | 0.012254 |
| 12 | 0.981838 | 0.018162 | C_HOUR | 59409 | 0.010137 |
| 13 | 0.990174 | 0.009826 | C_WDAY | 1323 | 0.000226 |
| 14 | 0.970588 | 0.029412 | C_VEHS | 544 | 0.000093 |
| 15 | 0.989610 | 0.010390 | C_MNTH | 385 | 0.000066 |
The NAs of the different variables have no significant discriminatory power over the target variable, since in every case they preserve the distribution of its values. Moreover, except for P_SAFE and V_YEAR, they do not represent a significant share of each variable's observations. For all these reasons, the imputations will be carried out as follows:
categorical_na_imputer('P_SAFE', data, NA_df)
data
| | C_YEAR | C_MNTH | C_WDAY | C_HOUR | C_SEV | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | ... | V_ID | V_TYPE | V_YEAR | P_ID | P_SEX | P_AGE | P_PSN | P_ISEV | P_SAFE | P_USER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1999 | 01 | 1 | 20 | 0 | 2.0 | 34 | NaN | 1 | 5 | ... | 01 | 06 | 1990.0 | 01 | M | 41.0 | 11 | 1 | 01 | 1 |
| 1 | 1999 | 01 | 1 | 20 | 0 | 2.0 | 34 | NaN | 1 | 5 | ... | 02 | 01 | 1987.0 | 01 | M | 19.0 | 11 | 1 | 02 | 1 |
| 2 | 1999 | 01 | 1 | 20 | 0 | 2.0 | 34 | NaN | 1 | 5 | ... | 02 | 01 | 1987.0 | 02 | F | 20.0 | 13 | 2 | 02 | 2 |
| 3 | 1999 | 01 | 1 | 08 | 0 | 1.0 | 01 | NaN | 5 | 3 | ... | 01 | 01 | 1986.0 | 01 | M | 46.0 | 11 | 1 | 02 | 1 |
| 4 | 1999 | 01 | 1 | 08 | 0 | 1.0 | 01 | NaN | 5 | 3 | ... | 99 | NaN | NaN | 01 | M | 5.0 | 99 | 2 | 02 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5860400 | 2014 | NaN | NaN | NaN | 0 | NaN | NaN | 01 | NaN | NaN | ... | 13 | 07 | NaN | 01 | M | 24.0 | 11 | 1 | 02 | 1 |
| 5860401 | 2014 | NaN | NaN | 23 | 0 | 1.0 | 06 | 05 | 1 | 1 | ... | 01 | 14 | 2006.0 | 01 | M | 29.0 | 96 | 2 | 09 | 5 |
| 5860402 | 2014 | NaN | NaN | 14 | 0 | 1.0 | 02 | 01 | 1 | 5 | ... | 01 | 01 | 2006.0 | 01 | F | NaN | 11 | 2 | 01 | 1 |
| 5860403 | 2014 | NaN | NaN | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | NaN | 01 | M | 67.0 | 12 | 3 | 01 | NaN |
| 5860404 | 2014 | NaN | NaN | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | NaN | 02 | M | 10.0 | 98 | 1 | 01 | NaN |
5860405 rows × 22 columns
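As a sanity check, frequency-weighted sampling should leave the marginal distribution of P_SAFE essentially unchanged; a sketch, assuming the pre-imputation proportions were saved beforehand:
# Sketch: the proportions after imputation should be close to those observed
# before it (proportions_before would need to be captured prior to the call,
# e.g. proportions_before = data['P_SAFE'].value_counts(normalize=True))
proportions_after = data['P_SAFE'].value_counts(normalize=True)
print(proportions_after.sort_index())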
categories_mode_inputer = [i for i in categories if i != 'P_SAFE']
categories_mode_inputer
['C_MNTH', 'C_WDAY', 'C_HOUR', 'C_CONF', 'C_RCFG', 'C_WTHR', 'C_RSUR', 'C_RALN', 'C_TRAF', 'V_TYPE', 'P_SEX', 'P_PSN', 'P_USER']
# Mode imputation for the remaining categorical variables
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
data[categories_mode_inputer] = imp_mode.fit_transform(data[categories_mode_inputer])
data
| | C_YEAR | C_MNTH | C_WDAY | C_HOUR | C_SEV | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | ... | V_ID | V_TYPE | V_YEAR | P_ID | P_SEX | P_AGE | P_PSN | P_ISEV | P_SAFE | P_USER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1999 | 01 | 1 | 20 | 0 | 2.0 | 34 | 02 | 1 | 5 | ... | 01 | 06 | 1990.0 | 01 | M | 41.0 | 11 | 1 | 01 | 1 |
| 1 | 1999 | 01 | 1 | 20 | 0 | 2.0 | 34 | 02 | 1 | 5 | ... | 02 | 01 | 1987.0 | 01 | M | 19.0 | 11 | 1 | 02 | 1 |
| 2 | 1999 | 01 | 1 | 20 | 0 | 2.0 | 34 | 02 | 1 | 5 | ... | 02 | 01 | 1987.0 | 02 | F | 20.0 | 13 | 2 | 02 | 2 |
| 3 | 1999 | 01 | 1 | 08 | 0 | 1.0 | 01 | 02 | 5 | 3 | ... | 01 | 01 | 1986.0 | 01 | M | 46.0 | 11 | 1 | 02 | 1 |
| 4 | 1999 | 01 | 1 | 08 | 0 | 1.0 | 01 | 02 | 5 | 3 | ... | 99 | 01 | NaN | 01 | M | 5.0 | 99 | 2 | 02 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5860400 | 2014 | 08 | 5 | 16 | 0 | NaN | 21 | 01 | 1 | 1 | ... | 13 | 07 | NaN | 01 | M | 24.0 | 11 | 1 | 02 | 1 |
| 5860401 | 2014 | 08 | 5 | 23 | 0 | 1.0 | 06 | 05 | 1 | 1 | ... | 01 | 14 | 2006.0 | 01 | M | 29.0 | 96 | 2 | 09 | 5 |
| 5860402 | 2014 | 08 | 5 | 14 | 0 | 1.0 | 02 | 01 | 1 | 5 | ... | 01 | 01 | 2006.0 | 01 | F | NaN | 11 | 2 | 01 | 1 |
| 5860403 | 2014 | 08 | 5 | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | NaN | 01 | M | 67.0 | 12 | 3 | 01 | 1 |
| 5860404 | 2014 | 08 | 5 | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | NaN | 02 | M | 10.0 | 98 | 1 | 01 | 1 |
5860405 rows × 22 columns
# Impute each missing V_YEAR with the mean vehicle model year of collisions from the same C_YEAR
lista_años = data.C_YEAR[data['V_YEAR'].isna()].unique().tolist()
for i in lista_años:
    valores = data.V_YEAR[(data['C_YEAR'] == i) & (data['V_YEAR'].notna())].values.tolist()
    media_valores = sum(valores) / len(valores)
    data['V_YEAR'] = np.where((data['C_YEAR'] == i) & (data['V_YEAR'].isna()), media_valores, data.V_YEAR)
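An equivalent, more vectorized alternative to the loop above would be a grouped transform (a sketch, not the code actually run here):
# Sketch: fill each missing V_YEAR with the mean model year of its C_YEAR group
data['V_YEAR'] = data['V_YEAR'].fillna(data.groupby('C_YEAR')['V_YEAR'].transform('mean'))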
data
| | C_YEAR | C_MNTH | C_WDAY | C_HOUR | C_SEV | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | ... | V_ID | V_TYPE | V_YEAR | P_ID | P_SEX | P_AGE | P_PSN | P_ISEV | P_SAFE | P_USER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1999 | 01 | 1 | 20 | 0 | 2.0 | 34 | 02 | 1 | 5 | ... | 01 | 06 | 1990.000000 | 01 | M | 41.0 | 11 | 1 | 01 | 1 |
| 1 | 1999 | 01 | 1 | 20 | 0 | 2.0 | 34 | 02 | 1 | 5 | ... | 02 | 01 | 1987.000000 | 01 | M | 19.0 | 11 | 1 | 02 | 1 |
| 2 | 1999 | 01 | 1 | 20 | 0 | 2.0 | 34 | 02 | 1 | 5 | ... | 02 | 01 | 1987.000000 | 02 | F | 20.0 | 13 | 2 | 02 | 2 |
| 3 | 1999 | 01 | 1 | 08 | 0 | 1.0 | 01 | 02 | 5 | 3 | ... | 01 | 01 | 1986.000000 | 01 | M | 46.0 | 11 | 1 | 02 | 1 |
| 4 | 1999 | 01 | 1 | 08 | 0 | 1.0 | 01 | 02 | 5 | 3 | ... | 99 | 01 | 1991.595226 | 01 | M | 5.0 | 99 | 2 | 02 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5860400 | 2014 | 08 | 5 | 16 | 0 | NaN | 21 | 01 | 1 | 1 | ... | 13 | 07 | 2006.234884 | 01 | M | 24.0 | 11 | 1 | 02 | 1 |
| 5860401 | 2014 | 08 | 5 | 23 | 0 | 1.0 | 06 | 05 | 1 | 1 | ... | 01 | 14 | 2006.000000 | 01 | M | 29.0 | 96 | 2 | 09 | 5 |
| 5860402 | 2014 | 08 | 5 | 14 | 0 | 1.0 | 02 | 01 | 1 | 5 | ... | 01 | 01 | 2006.000000 | 01 | F | NaN | 11 | 2 | 01 | 1 |
| 5860403 | 2014 | 08 | 5 | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | 2006.234884 | 01 | M | 67.0 | 12 | 3 | 01 | 1 |
| 5860404 | 2014 | 08 | 5 | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | 2006.234884 | 02 | M | 10.0 | 98 | 1 | 01 | 1 |
5860405 rows × 22 columns
categories_mean_inputer = [i for i in numeric_features if i != 'V_YEAR']
categories_mean_inputer
['C_YEAR', 'C_VEHS', 'P_AGE']
# Median imputation for the remaining numeric variables (note that, despite the
# variable name, the strategy is the median, which is robust to outliers)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
data[categories_mean_inputer] = imp_mean.fit_transform(data[categories_mean_inputer])
data
| | C_YEAR | C_MNTH | C_WDAY | C_HOUR | C_SEV | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | ... | V_ID | V_TYPE | V_YEAR | P_ID | P_SEX | P_AGE | P_PSN | P_ISEV | P_SAFE | P_USER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1999.0 | 01 | 1 | 20 | 0 | 2.0 | 34 | 02 | 1 | 5 | ... | 01 | 06 | 1990.000000 | 01 | M | 41.0 | 11 | 1 | 01 | 1 |
| 1 | 1999.0 | 01 | 1 | 20 | 0 | 2.0 | 34 | 02 | 1 | 5 | ... | 02 | 01 | 1987.000000 | 01 | M | 19.0 | 11 | 1 | 02 | 1 |
| 2 | 1999.0 | 01 | 1 | 20 | 0 | 2.0 | 34 | 02 | 1 | 5 | ... | 02 | 01 | 1987.000000 | 02 | F | 20.0 | 13 | 2 | 02 | 2 |
| 3 | 1999.0 | 01 | 1 | 08 | 0 | 1.0 | 01 | 02 | 5 | 3 | ... | 01 | 01 | 1986.000000 | 01 | M | 46.0 | 11 | 1 | 02 | 1 |
| 4 | 1999.0 | 01 | 1 | 08 | 0 | 1.0 | 01 | 02 | 5 | 3 | ... | 99 | 01 | 1991.595226 | 01 | M | 5.0 | 99 | 2 | 02 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5860400 | 2014.0 | 08 | 5 | 16 | 0 | 2.0 | 21 | 01 | 1 | 1 | ... | 13 | 07 | 2006.234884 | 01 | M | 24.0 | 11 | 1 | 02 | 1 |
| 5860401 | 2014.0 | 08 | 5 | 23 | 0 | 1.0 | 06 | 05 | 1 | 1 | ... | 01 | 14 | 2006.000000 | 01 | M | 29.0 | 96 | 2 | 09 | 5 |
| 5860402 | 2014.0 | 08 | 5 | 14 | 0 | 1.0 | 02 | 01 | 1 | 5 | ... | 01 | 01 | 2006.000000 | 01 | F | 34.0 | 11 | 2 | 01 | 1 |
| 5860403 | 2014.0 | 08 | 5 | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | 2006.234884 | 01 | M | 67.0 | 12 | 3 | 01 | 1 |
| 5860404 | 2014.0 | 08 | 5 | 22 | 1 | 1.0 | 06 | 01 | 2 | 4 | ... | 01 | 22 | 2006.234884 | 02 | M | 10.0 | 98 | 1 | 01 | 1 |
5860405 rows × 22 columns
data.dtypes
C_YEAR     float64
C_MNTH      object
C_WDAY      object
C_HOUR      object
C_SEV     category
C_VEHS     float64
C_CONF      object
C_RCFG      object
C_WTHR      object
C_RSUR      object
C_RALN      object
C_TRAF      object
V_ID        object
V_TYPE      object
V_YEAR     float64
P_ID        object
P_SEX       object
P_AGE      float64
P_PSN       object
P_ISEV      object
P_SAFE    category
P_USER      object
dtype: object
data[numeric_features] = data[numeric_features].astype(int)
data
| | C_YEAR | C_MNTH | C_WDAY | C_HOUR | C_SEV | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | ... | V_ID | V_TYPE | V_YEAR | P_ID | P_SEX | P_AGE | P_PSN | P_ISEV | P_SAFE | P_USER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1999 | 01 | 1 | 20 | 0 | 2 | 34 | 02 | 1 | 5 | ... | 01 | 06 | 1990 | 01 | M | 41 | 11 | 1 | 01 | 1 |
| 1 | 1999 | 01 | 1 | 20 | 0 | 2 | 34 | 02 | 1 | 5 | ... | 02 | 01 | 1987 | 01 | M | 19 | 11 | 1 | 02 | 1 |
| 2 | 1999 | 01 | 1 | 20 | 0 | 2 | 34 | 02 | 1 | 5 | ... | 02 | 01 | 1987 | 02 | F | 20 | 13 | 2 | 02 | 2 |
| 3 | 1999 | 01 | 1 | 08 | 0 | 1 | 01 | 02 | 5 | 3 | ... | 01 | 01 | 1986 | 01 | M | 46 | 11 | 1 | 02 | 1 |
| 4 | 1999 | 01 | 1 | 08 | 0 | 1 | 01 | 02 | 5 | 3 | ... | 99 | 01 | 1991 | 01 | M | 5 | 99 | 2 | 02 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5860400 | 2014 | 08 | 5 | 16 | 0 | 2 | 21 | 01 | 1 | 1 | ... | 13 | 07 | 2006 | 01 | M | 24 | 11 | 1 | 02 | 1 |
| 5860401 | 2014 | 08 | 5 | 23 | 0 | 1 | 06 | 05 | 1 | 1 | ... | 01 | 14 | 2006 | 01 | M | 29 | 96 | 2 | 09 | 5 |
| 5860402 | 2014 | 08 | 5 | 14 | 0 | 1 | 02 | 01 | 1 | 5 | ... | 01 | 01 | 2006 | 01 | F | 34 | 11 | 2 | 01 | 1 |
| 5860403 | 2014 | 08 | 5 | 22 | 1 | 1 | 06 | 01 | 2 | 4 | ... | 01 | 22 | 2006 | 01 | M | 67 | 12 | 3 | 01 | 1 |
| 5860404 | 2014 | 08 | 5 | 22 | 1 | 1 | 06 | 01 | 2 | 4 | ... | 01 | 22 | 2006 | 02 | M | 10 | 98 | 1 | 01 | 1 |
5860405 rows × 22 columns
data.isna().sum()
C_YEAR    0
C_MNTH    0
C_WDAY    0
C_HOUR    0
C_SEV     0
C_VEHS    0
C_CONF    0
C_RCFG    0
C_WTHR    0
C_RSUR    0
C_RALN    0
C_TRAF    0
V_ID      0
V_TYPE    0
V_YEAR    0
P_ID      0
P_SEX     0
P_AGE     0
P_PSN     0
P_ISEV    0
P_SAFE    0
P_USER    0
dtype: int64
# Drop the columns that add no information to the problem
data = data.drop(useless_columns, axis=1)
data
| | C_YEAR | C_MNTH | C_WDAY | C_HOUR | C_SEV | C_VEHS | C_CONF | C_RCFG | C_WTHR | C_RSUR | C_RALN | C_TRAF | V_TYPE | V_YEAR | P_SEX | P_AGE | P_PSN | P_SAFE | P_USER |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1999 | 01 | 1 | 20 | 0 | 2 | 34 | 02 | 1 | 5 | 3 | 03 | 06 | 1990 | M | 41 | 11 | 01 | 1 |
| 1 | 1999 | 01 | 1 | 20 | 0 | 2 | 34 | 02 | 1 | 5 | 3 | 03 | 01 | 1987 | M | 19 | 11 | 02 | 1 |
| 2 | 1999 | 01 | 1 | 20 | 0 | 2 | 34 | 02 | 1 | 5 | 3 | 03 | 01 | 1987 | F | 20 | 13 | 02 | 2 |
| 3 | 1999 | 01 | 1 | 08 | 0 | 1 | 01 | 02 | 5 | 3 | 6 | 18 | 01 | 1986 | M | 46 | 11 | 02 | 1 |
| 4 | 1999 | 01 | 1 | 08 | 0 | 1 | 01 | 02 | 5 | 3 | 6 | 18 | 01 | 1991 | M | 5 | 99 | 02 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5860400 | 2014 | 08 | 5 | 16 | 0 | 2 | 21 | 01 | 1 | 1 | 1 | 18 | 07 | 2006 | M | 24 | 11 | 02 | 1 |
| 5860401 | 2014 | 08 | 5 | 23 | 0 | 1 | 06 | 05 | 1 | 1 | 1 | 18 | 14 | 2006 | M | 29 | 96 | 09 | 5 |
| 5860402 | 2014 | 08 | 5 | 14 | 0 | 1 | 02 | 01 | 1 | 5 | 4 | 18 | 01 | 2006 | F | 34 | 11 | 01 | 1 |
| 5860403 | 2014 | 08 | 5 | 22 | 1 | 1 | 06 | 01 | 2 | 4 | 7 | 18 | 22 | 2006 | M | 67 | 12 | 01 | 1 |
| 5860404 | 2014 | 08 | 5 | 22 | 1 | 1 | 06 | 01 | 2 | 4 | 7 | 18 | 22 | 2006 | M | 10 | 98 | 01 | 1 |
5860405 rows × 19 columns
# Save the transformed dataset
data.to_csv("data/pd_data_EDA_missing_outlier.csv", index=False)
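One caveat worth keeping in mind: CSV does not preserve pandas dtypes, so any downstream notebook reading this file back would need to re-declare the categorical columns; a sketch:
# Sketch: re-declare dtypes after reading the CSV back (CSV stores plain text)
reloaded = pd.read_csv('data/pd_data_EDA_missing_outlier.csv', low_memory=False)
reloaded[categories] = reloaded[categories].astype('category')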